OSD Mamiello - Sci Rep 2018

1 Aim

  • Analysis of OSD Mamiello data for Sci Report paper 2018

2 Initialize.

This file defines all the necessary libraries and variables

  source('OSD_Mamiello_init.R', echo=FALSE)

3 Read data

3.1 Read excel file

file_supp <- "../Supplementary data/OSD_Mamiello_Tables Supplementary.xlsx"
otu <- read_excel(path = file_supp, sheet = "otus LGC Mamiello")
samples <- read_excel(path = file_supp, sheet = "samples")
metadata <- read_excel(path = file_supp, sheet = "metadata")

4 Summarize at species and ASV level

There are two types of species assignement

  • species_wang is the the species assigned by the Wang classifier to all ASV
  • species_ASV is the species assigned to the major ASV using careful phylogenetic analysis

4.1 Compute different dataframes

4.1.1 For all samples and all OTUs

In this paper we consider all samples not just surface samples

samples_surface <- samples %>% filter(surface_sample >= 0)
sample_number <- nrow(samples_surface)
glue::glue("Number of samples: {sample_number}")
Number of samples: 157
otu_long <- otu %>% select(ASV_code, genus, species_wang, species_ASV, contains("OSD")) %>% 
    gather(sample, n_seq, contains("OSD")) %>% filter(sample %in% samples_surface$sample)
otu_long$n_seq <- otu_long$n_seq %>% replace_na(0)

otu_seq <- otu_long %>% group_by(ASV_code) %>% summarise(n_seq_otu = sum(n_seq)) %>% 
    filter(n_seq_otu > 0)

glue::glue("Number of OTUs for all samples: {nrow(otu_seq)}")
Number of OTUs for all samples: 10715
glue::glue("Number of reads for all samples: {sum(otu_seq$n_seq_otu)}")
Number of reads for all samples: 218484
# write_tsv(species_seq, 'OSD Mamiello sequences.tsv')

sample_seq <- otu_long %>% group_by(sample) %>% summarise(n_seq_sample = sum(n_seq))

sample_number_above100 <- nrow(filter(sample_seq, n_seq_sample >= 100))
glue::glue("Number of samples with more than 100 Mamiellophyceae reads: {sample_number_above100}")
Number of samples with more than 100 Mamiellophyceae reads: 92
otu_long <- otu_long %>% left_join(sample_seq)

4.1.2 Function to define summary for specific groups of otus

summarise_species <- function(otu_long) {
    
    sample_number <- nrow(distinct(as.tibble(otu_long$sample)))
    print(glue::glue("Number of samples taken into account : {sample_number}"))
    
    species_seq <- otu_long %>% group_by(genus, species) %>% summarise(n_seq_species = sum(n_seq))
    
    
    species_samples <- otu_long %>% group_by(species, sample, n_seq_sample) %>% 
        summarise(n_seq = sum(n_seq)) %>% mutate(pct_seq = n_seq/n_seq_sample * 
        100)
    
    species_samples_metadata <- species_samples %>% left_join(select(samples, 
        sample, OSD_station, sample_label)) %>% left_join(metadata) %>% mutate(Latitude = as.numeric(Latitude), 
        Longitude = as.numeric(Longitude))
    
    # Summaries at species level for all species
    
    species_pct <- species_samples %>% group_by(species) %>% summarise(mean_pct_seq_species = mean(pct_seq), 
        max_pct_seq_species = max(pct_seq))
    
    species_presence <- species_samples %>% filter(n_seq > 0) %>% group_by(species) %>% 
        summarise(n_samples_present = n(), pct_samples_present = 100 * n()/sample_number)
    
    species_more_1pct <- species_samples %>% filter(pct_seq > 1) %>% group_by(species) %>% 
        summarise(n_samples_more_1pct = n(), pct_samples_more_1pct = 100 * n()/sample_number)
    
    species_summary <- species_seq %>% left_join(species_pct) %>% left_join(species_presence) %>% 
        left_join(species_more_1pct) %>% filter(!is.na(n_samples_present))  # This to remove the species not present among the selected OTUs
    
    species_stats <- list(summary = species_summary, metadata = species_samples_metadata)
}

4.1.3 For all OTUs at all samples

otu_long1 <- otu_long %>% rename(species = species_wang) %>% select(-species_ASV)
species_stats_1 <- summarise_species(otu_long1)
Number of samples taken into account : 152
species_summary_1 <- species_stats_1[["summary"]]

4.1.4 For selected ASVs at samples with at least 100 reads

otu_long1 <- otu_long %>% filter(n_seq_sample >= 100 & !is.na(species_ASV)) %>% 
    rename(species = species_ASV) %>% select(-species_wang)
species_stats_2 <- summarise_species(otu_long1)
Number of samples taken into account : 92
species_summary_2 <- species_stats_2[["summary"]]

4.2 Graphics at species levels

4.2.1 Define function for Histograms and treemaps

plot_species_summary <- function(species_summary, file_suffix = "1") {
    
    theme_chlorophyta_map <- theme(legend.position = c(0.1, 0.2), legend.background = element_rect(fill = "transparent"), 
        legend.text = element_text(size = 7), legend.title = element_text(size = 9), 
        plot.tag.position = "topright", plot.tag = element_text(size = 24, face = "bold"), 
        plot.title = element_text(size = 14))
    # Number of sequences per species
    g <- ggplot(species_summary, aes(x = reorder(species, n_seq_species), y = n_seq_species)) + 
        geom_col() + coord_flip() + xlab("Species") + ylab("Total of sequences - all samples")
    print(g)
    
    g_treemap <- ggplot(species_summary, aes(area = n_seq_species, fill = genus, 
        label = species, subgroup = genus)) + ggtitle("OSD - number of sequences") + 
        scale_fill_discrete(name = "Genus") + geom_treemap() + geom_treemap_subgroup_border() + 
        geom_treemap_text(colour = "white", place = "topleft", reflow = T, padding.x = grid::unit(3, 
            "mm"), padding.y = grid::unit(3, "mm"))
    print(g_treemap)
    
    ggsave(plot = g_treemap, filename = str_c("pdf/Treemap_sequences_", file_suffix, 
        ".pdf"), width = 10, height = 7, scale = 2, units = "cm", useDingbats = FALSE)
    
    
    treemap_dv(species_summary, c("genus", "species"), "n_seq_species", "OSD - number of sequences")
    
    
    # Mean pct of sequences per species
    
    g <- ggplot(species_summary, aes(area = mean_pct_seq_species, fill = genus, 
        label = species, subgroup = genus)) + ggtitle("OSD - mean of contribution", 
        subtitle = "") + geom_treemap() + geom_treemap_subgroup_border() + geom_treemap_text(colour = "white", 
        place = "topleft", reflow = T)
    print(g)
    
    treemap_dv(species_summary, c("genus", "species"), "mean_pct_seq_species", 
        "OSD - mean of contribution")
    
    # Graph
    
    graph_samples_present <- ggplot(species_summary, aes(x = reorder(species, 
        pct_samples_present), y = pct_samples_present)) + geom_col() + geom_text(aes(label = n_samples_present), 
        hjust = -0.25) + coord_flip() + xlab("Species") + ylab("% of samples where species detected") + 
        ylim(0, 100) + theme_dviz_grid() + theme_chlorophyta_map + labs(title = "", 
        tag = "A")
    
    
    print(graph_samples_present)
    
    graph_samples_more_1pct <- ggplot(species_summary, aes(x = reorder(species, 
        pct_samples_present), y = pct_samples_more_1pct)) + geom_col() + geom_text(aes(label = n_samples_more_1pct), 
        hjust = -0.25) + coord_flip() + xlab("Species") + ylab("% of samples where species represesents \n more than 1% of Mamiellophyceae") + 
        ylim(0, 100) + theme_dviz_grid() + theme_chlorophyta_map + labs(title = "", 
        tag = "B")
    
    print(graph_samples_more_1pct)
    
    grid_graphs <- gridExtra::grid.arrange(grobs = list(graph_samples_present, 
        graph_samples_more_1pct), ncol = 1, nrow = 2, clip = FALSE, padding = unit(0, 
        "line"))
    
    ggsave(plot = grid_graphs, filename = str_c("pdf/Presence_", file_suffix, 
        ".pdf"), width = 15, height = 20, scale = 1.8, units = "cm", useDingbats = FALSE)
    
    # Relation between samples present and samples more than 1 pct
    g <- ggplot(species_summary, aes(x = pct_samples_more_1pct, y = pct_samples_present, 
        label = species)) + geom_point() + geom_text(size = 2, check_overlap = TRUE, 
        vjust = 1) + xlim(0, 100)
    print(g)
    
    # Relation between sequence per species and samples present
    g <- ggplot(species_summary, aes(x = mean_pct_seq_species, y = pct_samples_present, 
        label = species)) + geom_point() + geom_text(size = 2, check_overlap = TRUE, 
        vjust = 1)
    print(g)
}

4.2.2 For all OTUs at all samples

plot_species_summary(species_stats_1[["summary"]], file_suffix = "all_ASV")

4.2.3 For selected OTUs (major ASV) at samples with more than 100 reads

plot_species_summary(species_stats_2[["summary"]], file_suffix = "selected_ASV")

5 Metadata

5.1 Statistics

metadata_summarized <- metadata %>% transmute(Temperature = as.numeric(Temperature), 
    Salinity = as.numeric(Salinity), Nitrates = as.numeric(Nitrates), Phosphates = as.numeric(Phosphates), 
    Chlorophylle_a = as.numeric(Chlorophylle_a)) %>% summarise_all(funs(min, 
    max, mean, median, sd), na.rm = TRUE)
knitr::kable(metadata_summarized)
Temperature_min Salinity_min Nitrates_min Phosphates_min Chlorophylle_a_min Temperature_max Salinity_max Nitrates_max Phosphates_max Chlorophylle_a_max Temperature_mean Salinity_mean Nitrates_mean Phosphates_mean Chlorophylle_a_mean Temperature_median Salinity_median Nitrates_median Phosphates_median Chlorophylle_a_median Temperature_sd Salinity_sd Nitrates_sd Phosphates_sd Chlorophylle_a_sd
-1.6 0.14 0 0 0.07 31 100 12 1.6 75 20 32 2.3 0.23 7.4 20 34 0.77 0.16 2.6 6.3 10 3.3 0.23 13

6 World Maps for each species

# Define constants for the map
size_factor = 1
size_points <- 2.5
size_cross <- 1
color_ice <- "lightslateblue"
color_water <- "red"
color_cultures <- "green"
color_not_present <- "blue"
color_morpho <- "brown"

species_samples_metadata <- species_stats_2[["metadata"]]
species_summary <- species_stats_2[["summary"]]


for (one_genus in c("Bathycoccus", "Ostreococcus", "Micromonas", "Mantoniella")) {
    
    species_selected <- species_summary$species[species_summary$genus == one_genus]
    map_array_main_fig <- list()  # to store the maps to do tiled graph
    map_array_main_fig_eu <- list()  # to store the EU maps to do tiled graph
    
    for (one_species in species_selected) {
        one_species_summary <- species_summary %>% filter(species == one_species)
        
        if (one_species_summary$max_pct_seq_species > 20) {
            species_limits = c(0, 80)
            species_breaks = c(10, 20, 40, 80)
        } else {
            species_limits = c(0, 20)
            species_breaks = c(2, 5, 10, 20)
        }
        
        species.absent <- species_samples_metadata %>% filter(n_seq_sample >= 
            100 & pct_seq < 1 & species == one_species)
        
        species.present <- species_samples_metadata %>% filter(n_seq_sample >= 
            100 & pct_seq >= 1 & species == one_species)
        
        one_species_map <- map_world(color_borders = "grey85") + geom_point(data = species.absent, 
            aes(x = Longitude, y = Latitude), color = color_not_present, size = size_cross, 
            shape = 3) + geom_point(data = species.present, aes(x = Longitude, 
            y = Latitude, size = pct_seq, color = pct_seq, alpha = pct_seq)) + 
            theme(legend.position = c(0.1, 0.2), legend.background = element_rect(fill = "transparent"), 
                legend.text = element_text(size = 7), legend.title = element_text(size = 9)) + 
            scale_size(name = "% of Mamiellophyceae", range = c(0, 5), limits = species_limits, 
                breaks = species_breaks) + scale_alpha_continuous(name = "% of Mamiellophyceae", 
            range = c(0.5, 0.9), limits = species_limits, breaks = species_breaks) + 
            viridis::scale_color_viridis(option = "magma", name = "% of Mamiellophyceae", 
                limits = species_limits, breaks = species_breaks) + guides(colour = guide_legend()) + 
            theme(plot.title = element_text(margin = margin(t = 10, b = 5))) + 
            ggtitle(str_c(one_species, " - ", one_species_summary$n_seq_species, 
                " reads, ", one_species_summary$n_samples_more_1pct, " samples"))
        
        # range gives maximum and minimum size of symbols, limits the extent of the
        # scale replace guide = 'legend' by guide=FALSE to remove the legend....
        # NOT USED : guides(color = guide_legend(override.aes = list(size=5))) +
        
        one_species_map_eu <- one_species_map + coord_fixed(1.3, xlim = c(-40, 
            40), ylim = c(30, 70)) + scale_x_continuous(breaks = (-4:4) * 10) + 
            scale_y_continuous(breaks = (3:7) * 10)
        
        print(one_species_map)
        print(one_species_map_eu)
        
        map_array_main_fig[[one_species]] <- one_species_map
        map_array_main_fig_eu[[one_species]] <- one_species_map_eu
    }
    
    grid_map_main_fig <- gridExtra::grid.arrange(grobs = map_array_main_fig, 
        ncol = 2, nrow = 5, clip = FALSE, padding = unit(0, "line"))
    grid_map_main_fig_eu <- gridExtra::grid.arrange(grobs = map_array_main_fig_eu, 
        ncol = 2, nrow = 5, clip = FALSE, padding = unit(0, "line"))
    
    # Save pdf
    ggsave(plot = grid_map_main_fig, filename = str_c("pdf/Map_", one_genus, 
        ".pdf"), width = 20, height = 35, scale = 2, units = "cm", useDingbats = FALSE)
    ggsave(plot = grid_map_main_fig_eu, filename = str_c("pdf/Map_", one_genus, 
        "_EU.pdf"), width = 20, height = 35, scale = 2, units = "cm", useDingbats = FALSE)
    
}

7 Heatmaps

7.1 Filter the data to remove the class that have less than 1% contribution

species_heatmap_data <- species_samples_metadata %>% ungroup() %>% mutate(sample_code = str_c(Ocean_code, 
    "_", sample_code)) %>% select(sample_label, species, pct_seq) %>% spread(species, 
    pct_seq) %>% tibble::column_to_rownames(var = "sample_label") %>% t()

species_heatmap_data_vertical <- species_samples_metadata %>% ungroup() %>% 
    mutate(sample_label = str_c(Ocean_code, "_", sample_label)) %>% select(sample_label, 
    species, pct_seq) %>% spread(species, pct_seq) %>% tibble::column_to_rownames(var = "sample_label")

species_heatmap_data <- t(species_heatmap_data_vertical)

7.2 Use ComplexHeatmap

See Web site

library(ComplexHeatmap)

# Palette de couleurs

reds = colorRampPalette(c("grey95", "orange", "red3"))
couleurs = reds(10)

pdf(file = "pdf/Heatmap horizontal.pdf", width = 15, height = 6)
Heatmap(as.matrix(species_heatmap_data), col = couleurs, clustering_distance_columns = function(m) vegan::vegdist(m), 
    cluster_rows = TRUE, cluster_columns = TRUE, show_column_dend = TRUE, show_row_dend = FALSE, 
    column_dend_height = unit(30, "mm"), column_title = "Sample", row_title = "", 
    column_title_side = "top", row_title_side = "left", row_title_gp = gpar(fontsize = 12, 
        fontface = "plain"), column_title_gp = gpar(fontsize = 12, fontface = "plain"), 
    column_names_gp = gpar(fontsize = 8, fontface = "plain"), column_names_side = "top", 
    column_dend_side = "bottom", name = "%", heatmap_legend_param = list(title_gp = gpar(fontface = "plain")))
while (!is.null(dev.list())) dev.off()


pdf(file = "pdf/Heatmap vertical.pdf", width = 6, height = 12)
Heatmap(as.matrix(species_heatmap_data_vertical), col = couleurs, split = 7, 
    combined_name_fun = NULL, km_title = "", clustering_distance_rows = function(m) vegan::vegdist(m), 
    clustering_distance_columns = function(m) vegan::vegdist(m), cluster_rows = TRUE, 
    cluster_columns = TRUE, show_column_dend = FALSE, show_row_dend = TRUE, 
    row_dend_width = unit(30, "mm"), column_title = "", row_title = "Sample", 
    column_title_side = "bottom", row_title_side = "right", row_title_gp = gpar(fontsize = 0, 
        fontface = "plain"), column_title_gp = gpar(fontsize = 15, fontface = "plain"), 
    row_names_gp = gpar(fontsize = 8, fontface = "plain"), name = "%", heatmap_legend_param = list(title_gp = gpar(fontface = "plain")))
while (!is.null(dev.list())) dev.off()

Daniel Vaulot

06 09 2018